Take-home Exercise 3

Creating data visualisation beyond default

Clarence Tay https://www.linkedin.com/in/clarencetay/ (Singapore Management University - MITB)https://scis.smu.edu.sg/master-it-business
2022-05-15

Overview

In this take-home exercise, I will…

Getting Started

Before I get started, it is important for us to ensure that the required R packages have been installed. If yes, we will load the R packages. If they have yet to be installed, I will install the R packages and load them onto R environment.

The chunk code below will do the trick.

packages = c('tidyverse','psych','plotly', 'ggiraph')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
    }
  library(p, character.only = T)
  }

Importing Data

The code chunk below imports Participants.csv from the data folder, into R by using read_csv() of readr and save it as an tibble dataframe called participants_data.

# emp_data <- read_csv("data/Employers.csv")

job_data <- read_csv("data/Jobs.csv")

fin_data <- read_csv("data/FinancialJournal.csv")

participants_data <- read_csv("data/Participants.csv")

pub_data <- read_csv("data/Pubs.csv")

restaurant_data <- read_csv("data/Restaurants.csv")

checkin_data <- read_csv("data/CheckinJournal.csv")


glimpse(checkin_data)
Rows: 2,100,635
Columns: 4
$ participantId <dbl> 619, 15, 23, 699, 876, 23, 154, 359, 509, 556,~
$ timestamp     <dttm> 2022-03-01 05:35:00, 2022-03-01 05:50:00, 202~
$ venueId       <dbl> 1798, 1798, 894, 1798, 1804, 895, 446, 893, 18~
$ venueType     <chr> "Pub", "Pub", "Pub", "Pub", "Restaurant", "Res~
table(fin_data$category)

     Education           Food     Recreation RentAdjustment 
          4060         974100         355359            131 
       Shelter           Wage 
         14103         508577 
des <- describe(fin_data, fast = TRUE)
print(des, digits=5)
              vars       n      mean       sd       min      max
participantId    1 1856330 480.81309 295.8256     0.000 1010.000
timestamp        2 1856330       NaN       NA       Inf     -Inf
category         3 1856330       NaN       NA       Inf     -Inf
amount           4 1856330  19.92172 108.0637 -1562.726 4096.526
                 range      se
participantId 1010.000 0.21712
timestamp         -Inf      NA
category          -Inf      NA
amount        5659.252 0.07931
ggplot(data=checkin_data, aes(x='participantId')) +
  geom_bar() +
  facet_wrap(~venueType)

merged_dataframe <- merge(x = participants_data, y = checkin_data, all.x = TRUE)
ggplotly(ggplot(data = merged_dataframe,
       aes(x=age, fill= venueType)) +
  geom_histogram(bins=20, color='black') +
  ggtitle("Histogram of Participants' age, filled by Education Level") +
  xlab('Age') +
  ylab('Count') +
  labs(fill="Education Level") + 
  scale_x_continuous(breaks = seq(10, 70, by = 2)) + 
  scale_y_continuous(breaks = seq(0, 100, by = 10)))
ggplot(data = merged_dataframe,
         aes(y=age, x = haveKids)) +
  geom_boxplot() +
  ggtitle("Box Plot of Participants' age across various Interest Group, grouped by Education") +
  xlab('Have Kids?') +
  ylab('Age') +
  facet_grid(educationLevel~venueType) +
  coord_flip()

merged_dataframe %>%
  mutate(`Education Level` = fct_relevel(educationLevel,"Low","HighSchoolOrCollege","Graduate","Bachelors")) %>%
  ggplot(aes(x = `Education Level`, 
             fill = venueType)) +
  geom_bar(position = 'fill') +
  geom_text(stat = 'count', 
            aes(label = stat(count)), 
            position = position_fill(vjust=0.9)) +
  scale_y_continuous(breaks = seq(0,1, by = 0.1), 
                     labels = scales::percent) +
  scale_x_discrete(labels = c("Low", "High School Or College", "Bachelors", "Graduate")) +
  labs(y = 'Percentage\nof\nParticipants', 
       title = "Percentage Distribution of Participants' Education Level", 
       subtitle ='With respect to where Participants are travelling to', 
       fill ='Venue') +
  theme(axis.title.y = element_text(angle = 0), 
        axis.ticks.x = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(color = 'grey'))

checkin_data %>%
  mutate(venueType) %>% 
  ggplot(aes(x=))
ggplot(data=checkin_data, aes(x='participantId')) +
  geom_bar() +
  facet_wrap(~venueType)

participants_data %>% mutate(Education Level = fct_relevel(educationLevel,“Low”,“HighSchoolOrCollege”,“Graduate”,“Bachelors”)) %>% ggplot(aes(x = Education Level, fill = haveKids)) + geom_bar(position = ‘fill’) + geom_text(stat = ‘count’, aes(label = stat(count)), position = position_fill(vjust=0.9)) + scale_y_continuous(breaks = seq(0,1, by = 0.1), labels = scales::percent) + scale_x_discrete(labels = c(“Low”, “High School Or College”, “Bachelors”, “Graduate”)) + labs(y = ‘Percentage’, title = “Percentage Distribution of Participants’ Education Level”, subtitle =‘With respect to having kids or not’, fill =‘Have Kids?’) + theme(axis.title.y = element_text(angle = 0), axis.ticks.x = element_blank(), panel.background = element_blank(), axis.line = element_line(color = ‘grey’))

merged_rest <- merge(x = checkin_data, y = restaurant_data, by.x = 'venueId', by.y = 'restaurantId')
merged_pub <- merge(x = checkin_data, y = pub_data, by.x = 'venueId', by.y = 'pubId')
merged_rest$Month_Yr <- format(as.Date(merged_rest$timestamp), "%Y-%m")

# need zoo package
# merged_rest$Month_Yr <- as.yearmon(merged_rest$timestamp))
glimpse(merged_rest)
Rows: 447,602
Columns: 9
$ venueId       <dbl> 445, 445, 445, 445, 445, 445, 445, 445, 445, 4~
$ participantId <dbl> 993, 164, 868, 247, 52, 223, 764, 546, 137, 16~
$ timestamp     <dttm> 2022-08-19 14:10:00, 2023-02-09 13:30:00, 202~
$ venueType     <chr> "Restaurant", "Restaurant", "Restaurant", "Res~
$ foodCost      <dbl> 5.15, 5.15, 5.15, 5.15, 5.15, 5.15, 5.15, 5.15~
$ maxOccupancy  <dbl> 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71~
$ location      <chr> "POINT (631.5130723031391 2001.4772026036535)"~
$ buildingId    <dbl> 304, 304, 304, 304, 304, 304, 304, 304, 304, 3~
$ Month_Yr      <chr> "2022-08", "2023-02", "2022-03", "2023-03", "2~
table(merged_rest$venueId)

  445   446   447   448   449   895   896   897   898   899  1345 
 8358 17890 34295 33109 41693 30355 20784 31276 12801 28898 26330 
 1346  1347  1348  1349  1801  1802  1803  1804  1805 
 6326  9468  9625  5077 48264 16049 12124 15685 39195 
merged_rest$venueId = as.factor(merged_rest$venueId)
p <- ggplot(data = merged_rest, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
  geom_bar(stat="count",
           position = position_stack(reverse = TRUE),
           aes(text = paste0("Month_Yr: ", Month_Yr, "<br>", 
                             "Max Occupancy: ", maxOccupancy,"<br>",
                             "Building: ", buildingId,"<br>", 
                             "Food Cost: ", foodCost))) + 
  coord_flip() +
  # facet_wrap(~Month_Yr) +
  scale_y_continuous(breaks = seq(0,50000, by = 5000)) +
  labs(x= 'Restaurant ID',
       y = 'Count of Customers to Restaurants', 
       title = "XXX", 
       subtitle = 'YYY', 
       fill = 'Year-Month') +
  theme(axis.title.y = element_text(angle = 0), 
        #axis.ticks.x = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(color= 'grey'))

ggplotly(p, tooltip = c("x","text"))
p <- ggplot(data = merged_rest, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
  geom_bar(stat="count",
           position = position_stack(reverse = TRUE),
           aes(text = paste0("Month_Yr: ", Month_Yr, "<br>", 
                             "Max Occupancy: ", maxOccupancy,"<br>",
                             "Building: ", buildingId,"<br>", 
                             "Food Cost: ", foodCost))) + 
  coord_flip() +
  facet_wrap(~Month_Yr) +
  scale_y_continuous(breaks = seq(0,50000, by = 5000)) +
  labs(x= 'Restaurant ID',
       y = 'Count of Customers to Restaurants', 
       title = "XXX", 
       subtitle = 'YYY', 
       fill = 'Year-Month') +
  theme(axis.title.y = element_text(angle = 0), 
        #axis.ticks.x = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(color= 'grey'),
        panel.margin.x = unit(1,"lines"),
        panel.margin.y = unit(2,"lines"))

ggplotly(p, tooltip = c("x","text"))
merged_pub$Month_Yr <- format(as.Date(merged_pub$timestamp), "%Y-%m")
glimpse(merged_pub)
Rows: 355,378
Columns: 9
$ venueId       <dbl> 442, 442, 442, 442, 442, 442, 442, 442, 442, 4~
$ participantId <dbl> 50, 948, 141, 952, 113, 583, 115, 830, 165, 83~
$ timestamp     <dttm> 2022-03-25 13:45:00, 2022-06-26 08:35:00, 202~
$ venueType     <chr> "Pub", "Pub", "Pub", "Pub", "Pub", "Pub", "Pub~
$ hourlyCost    <dbl> 8.281103, 8.281103, 8.281103, 8.281103, 8.2811~
$ maxOccupancy  <dbl> 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64~
$ location      <chr> "POINT (964.4380231713202 3991.603473784208)",~
$ buildingId    <dbl> 556, 556, 556, 556, 556, 556, 556, 556, 556, 5~
$ Month_Yr      <chr> "2022-03", "2022-06", "2022-07", "2022-03", "2~
table(merged_pub$venueId)

  442   443   444   892   893   894  1342  1343  1344  1798  1799 
21703 19807 17592 23603 25255 20253 50400 31162 54174 31212 28589 
 1800 
31628 
merged_pub$venueId = as.factor(merged_pub$venueId)

aes(text = paste0(“Month_Yr:”, Month_Yr, “
”, “Hourly Cost:”, hourlyCost))

, tooltip = “text”

p <- ggplot(data = merged_pub, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
  geom_bar(stat="count", 
           position = position_stack(reverse = TRUE),
           aes(text = paste0("Month_Yr: ", Month_Yr, "<br>", 
                             "Max Occupancy: ", maxOccupancy,"<br>",
                             "Building: ", buildingId,"<br>", 
                             "Hourly Cost: ", hourlyCost))) + 
  coord_flip() +
  scale_y_continuous(breaks = seq(0,60000, by = 5000)) +
  labs(x= 'Pub ID',
       y = 'Count of Customers to Pubs', 
       title = "XXX", 
       subtitle = 'YYY', 
       fill = 'Year-Month') +
  theme(axis.title.y = element_text(angle = 0), 
        #axis.ticks.x = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(color= 'grey'))

ggplotly(p, tooltip = c("x","text"))
p <- ggplot(data = merged_pub, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
  geom_bar(stat="count", 
           position = position_stack(reverse = TRUE),
           aes(text = paste0("Month_Yr: ", Month_Yr, "<br>", 
                             "Max Occupancy: ", maxOccupancy,"<br>",
                             "Building: ", buildingId,"<br>", 
                             "Hourly Cost: ", hourlyCost))) + 
  coord_flip() +
  facet_wrap(~Month_Yr) +
  scale_y_continuous(breaks = seq(0,60000, by = 5000)) +
  labs(x= 'Pub ID',
       y = 'Count of Customers to Pubs', 
       title = "XXX", 
       subtitle = 'YYY', 
       fill = 'Year-Month') +
  theme(axis.title.y = element_text(angle = 0), 
        #axis.ticks.x = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(color= 'grey'))

ggplotly(p, tooltip = c("x","text"))
p <- ggplot(data = merged_pub, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
  geom_bar(stat="count", 
           position = position_stack(reverse = TRUE),
           aes(text = paste0("Month_Yr: ", Month_Yr, "<br>", 
                             "Max Occupancy: ", maxOccupancy,"<br>",
                             "Building: ", buildingId,"<br>", 
                             "Hourly Cost: ", hourlyCost))) + 
  coord_flip() +
  scale_y_continuous(breaks = seq(0,60000, by = 5000)) +
  labs(x= 'Pub ID',
       y = 'Count of Customers to Pubs', 
       title = "XXX", 
       subtitle = 'YYY', 
       fill = 'Year-Month') +
  theme(axis.title.y = element_text(angle = 0), 
        #axis.ticks.x = element_blank(),
        panel.background = element_blank(), 
        axis.line = element_line(color= 'grey'))

ggplotly(p, tooltip = c("x","text"))
ggplot(data = merged_rest, aes(x = fct_infreq(venueId), fill = Month_Yr)) +
  geom_bar(stat="count") + 
  coord_flip()